"""
Joshua Arribere Feb 24, 2014

Script to collapse reads containing the exact same sequence. For each distinct sequence,
    the read information from its first occurrence is written to the outFile.

Input: reads.fastq - fastq-formatted reads

Output: reads.collapsed.fastq - the input with redundant reads removed and the last 6
    characters trimmed from each sequence line and its quality line

run as python readCollapser.py inFile.fastq outPrefix
"""

import sys, collections, numpy

def main(args):
    """Collapse reads with identical sequences and trim the trailing 6 characters.

    args is a two-item sequence: the path of the input fastq file followed by
    the path of the output file (called outPrefix here, but it is opened
    directly as the output file name).

    The input is consumed as consecutive four-line records (header, sequence,
    separator, quality). The first record seen for each distinct sequence line
    is written to the output with the last 6 characters removed from both the
    sequence and the quality line; later records with the same sequence line
    are skipped. A trailing partial record (fewer than four lines) is ignored.
    """
    inFile,outPrefix=args[0:]
    trimLen=6 #number of trailing characters removed from sequence and quality

    #Uniqueness is judged on the full, untrimmed sequence line, so two reads that
    #share the trimmed portion but differ in their last 6 characters are both kept.
    seen=set()
    with open(inFile,'r') as f:
        with open(outPrefix,'w') as g:
            currRead=[]
            for line in f:
                currRead.append(line.strip())
                if len(currRead)<4:
                    continue
                #A full record is in hand; handle it now rather than waiting for the
                #next line, otherwise the last record of the file is never written.
                header,seq,sep,qual=currRead
                currRead=[]
                if seq in seen:
                    continue
                seen.add(seq)
                #must trim quality scores and reads simultaneously. Hence the double trim
                g.write('%s\n%s\n%s\n%s\n'%(header,seq[:-trimLen],sep,qual[:-trimLen]))

if __name__ == '__main__':
    """run as python readCollapser.py inFile.fastq outPrefix"""
    # Forward everything after the script name to main().
    cliArgs = sys.argv[1:]
    main(cliArgs)
